# -*- coding: utf-8 -*-


'''
Read content from Haodoo.net pdb file.
'''

__license__   = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
__docformat__ = 'restructuredtext en'


import struct
import os

from calibre import prepare_string_for_xml
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE

# Header identifier that makes Reader parse record 0 with LegacyHeaderRecord
# and decode the book as cp950. Any other identifier is handled as UTF-16LE.
BPDB_IDENT = 'BOOKMTIT'
# Not referenced in this module's visible code; presumably the identifier of
# the UTF-16LE variant -- confirm against the code that dispatches on it.
UPDB_IDENT = 'BOOKMTIU'

# Maps single punctuation characters found in the book text to the
# replacement text emitted by fix_punct(). Every key is exactly one
# character, which is what allows the table to be used with str.translate().
punct_table = {
    # Vertical presentation-form brackets -> their horizontal counterparts.
    "︵": "（",
    "︶": "）",
    "︷": "｛",
    "︸": "｝",
    "︹": "〔",
    "︺": "〕",
    "︻": "【",
    "︼": "】",
    "︗": "〖",
    "︘": "〗",
    # NOTE(review): both square-bracket forms map to the *pair* "［］" rather
    # than to a single opening/closing bracket like the entries above.
    # Kept unchanged; confirm whether this is intentional.
    "﹇": "［］",
    "﹈": "［］",
    "︽": "《",
    "︾": "》",
    "︿": "〈",
    "﹀": "〉",
    "﹁": "「",
    "﹂": "」",
    "﹃": "『",
    "﹄": "』",
    # Vertical dashes, ellipses and tilde.
    "｜": "—",
    "︙": "…",
    "ⸯ": "～",
    "│": "…",
    "￤": "…",
    # Ideographic space.
    "　": "  ",
    }

# Translation table derived from punct_table, built once at import time so
# fix_punct() rewrites a string in a single pass instead of one pass per entry.
_PUNCT_TRANSLATION = str.maketrans(punct_table)


def fix_punct(line):
    '''
    Return ``line`` with every character listed in ``punct_table`` replaced
    by its mapped text.

    No replacement text contains a key of the table, so translating all
    characters at once gives the same result as applying the replacements
    one after another.

    :param line: text to convert (``str``).
    :return: the converted text; unchanged if no listed character occurs.
    '''
    return line.translate(_PUNCT_TRANSLATION)


class LegacyHeaderRecord:
    '''
    Header record of a book whose text is encoded as cp950.

    Attributes set by the constructor:

    * ``title`` -- book title, punctuation-normalised with ``fix_punct``.
    * ``num_records`` -- integer parsed from the second field.
    * ``chapter_titles`` -- list with one normalised title per remaining field.
    '''

    def __init__(self, raw):
        '''
        Parse ``raw``, the bytes of the header record.

        Leading whitespace bytes are dropped, every run of three ESC bytes
        is collapsed to a single ESC, and the result is split on ESC into
        title, record count and chapter titles (in that order).
        '''
        collapsed = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b')
        parts = collapsed.split(b'\x1b')

        self.title = fix_punct(parts[0].decode('cp950', 'replace'))
        self.num_records = int(parts[1])
        # Chapter titles may carry NUL padding, which is removed after decoding.
        self.chapter_titles = [
            fix_punct(chunk.decode('cp950', 'replace').rstrip('\x00'))
            for chunk in parts[2:]
        ]


class UnicodeHeaderRecord:
    '''
    Header record of a book whose text is encoded as UTF-16LE.

    Attributes set by the constructor:

    * ``title`` -- book title, punctuation-normalised with ``fix_punct``.
    * ``num_records`` -- integer parsed from the second field.
    * ``chapter_titles`` -- list of normalised chapter titles, obtained by
      splitting the third field on UTF-16LE encoded CR LF.
    '''

    def __init__(self, raw):
        '''
        Parse ``raw``, the bytes of the header record.

        Leading whitespace bytes are dropped, every run of three UTF-16LE
        ESC code units is collapsed to one, and the result is split on ESC
        into title, record count and the chapter-title block.
        '''
        stripped = raw.lstrip()
        # bytes.lstrip() works on single bytes, not on 16-bit code units.
        # A title starting with a character whose low byte is ASCII
        # whitespace (e.g. U+4E09, encoded as 09 4E) would lose that byte,
        # leaving the title misaligned and undecodable. An odd number of
        # removed bytes means exactly that happened, so give one byte back.
        removed = len(raw) - len(stripped)
        if removed % 2:
            stripped = raw[removed - 1:]

        fields = stripped.replace(b'\x1b\x00\x1b\x00\x1b\x00',
                b'\x1b\x00').split(b'\x1b\x00')
        self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(fields[1])
        # Chapter titles may carry NUL padding, which is removed after decoding.
        self.chapter_titles = [
            fix_punct(chunk.decode('utf_16_le', 'replace').rstrip('\x00'))
            for chunk in fields[2].split(b'\r\x00\n\x00')
        ]


class Reader(FormatReader):
    '''
    Reads a Haodoo.net PDB book and converts it to an HTML file plus OPF.

    The text encoding (cp950 or UTF-16LE) and the header-record parser are
    chosen from ``header.ident``.
    '''

    def __init__(self, header, stream, log, options):
        '''
        :param header: parsed PDB header; must provide ``num_sections``,
            ``section_data(i)`` and ``ident``.
        :param stream: seekable binary stream of the PDB file, used by
            :meth:`author`.
        :param log: logger providing ``info`` and ``debug``.
        :param options: accepted for interface compatibility; not used here.
        '''
        self.stream = stream
        self.log = log

        # Load every section up front; section 0 is the header record.
        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]

        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        '''
        Return the author stored in the first 35 bytes of the file.

        The author is only read when the byte at offset 35 equals 2;
        otherwise ``'Unknown'`` is returned.
        '''
        self.stream.seek(35)
        version = struct.unpack('>b', self.stream.read(1))[0]
        if version != 2:
            return 'Unknown'

        self.stream.seek(0)
        raw = self.stream.read(35)
        # Decode first and strip NUL *characters* afterwards. Stripping NUL
        # bytes from undecoded UTF-16LE data would also remove the high byte
        # of a trailing ASCII/Latin-1 character and corrupt it.
        if self.encoding == 'utf_16_le' and len(raw) % 2:
            # Drop the dangling byte so it is not decoded as U+FFFD.
            raw = raw[:-1]
        return raw.decode(self.encoding, 'replace').rstrip('\x00')

    def get_metadata(self):
        '''
        Build the book metadata from the header title and :meth:`author`.

        The language is always set to ``'zh-tw'``.
        '''
        mi = MetaInformation(self.header_record.title,
                             [self.author()])
        mi.language = 'zh-tw'

        return mi

    def section_data(self, number):
        '''Return the raw bytes of section ``number``.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Return section ``number`` decoded with the book's encoding, with
        trailing NUL characters removed. Undecodable input is replaced.
        '''
        return self.section_data(number).decode(self.encoding,
                'replace').rstrip('\x00')

    def extract_content(self, output_dir):
        '''
        Write ``index.html`` and ``metadata.opf`` into ``output_dir``.

        Each text section becomes one chapter. The first line containing
        the chapter title is emitted as the chapter heading; if no line
        contains it, a heading with the title is put in front of the
        chapter. All text is XML-escaped.

        :return: path of the written ``metadata.opf``.
        '''
        chapters = []

        self.log.info('Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            title = self.header_record.chapter_titles[i-1]
            lines = []
            title_added = False
            for line in self.decompress_text(i).splitlines():
                line = fix_punct(line).strip()
                # The title is matched against the raw text; escaping
                # happens only for what is written out.
                escaped = prepare_string_for_xml(line)
                if not title_added and title in line:
                    # Headings are block elements and must not sit in a <p>.
                    lines.append('<h1 class="chapter">%s</h1>\n' % escaped)
                    title_added = True
                else:
                    lines.append('<p>%s</p>' % escaped)
            if not title_added:
                lines.insert(0, '<h1 class="chapter">%s</h1>\n'
                             % prepare_string_for_xml(title))
            chapters.append('\n'.join(lines))

        # Chapters are concatenated without a separator.
        txt = ''.join(chapters)

        self.log.info('Converting text to OEB...')
        html = HTML_TEMPLATE % (
            prepare_string_for_xml(self.header_record.title), txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')
